Configuration Schemas

Dataset config schema

The dataset config is the base format for simulation configs (see below). A dataset config just specifies some generic details about the files in any dataset (whether or not it was produced by a simulation), and how to load them. The simulation config extends this by adding further details about the simulation which produced the dataset.

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/dataset.yml"
description: >-
    config file for a dataset--in this case a dataset is a collection of
    files containing SNP data organized in one of the dataset formats
    understood by DNADNA; this is used both for loading simulated data
    and for loading other datasets on which we want to perform prediction
type: "object"
properties:
    data_root:
        type: "string"
        format: "filename"
        description: >-
            root directory for all files related to the dataset, either as
            an absolute path, or as a path relative to the location of this
            config file

    dataset_name:
        type: "string"
        description: >-
            a name to give the dataset; used in generating filenames and
            logging output

    scenario_params_path:
        type: "string"
        format: "filename"
        description: >-
            path to the CSV file containing the per-scenario
            parameters used in this simulation, either as an
            absolute path, or as a path relative to this config file

    data_source:
        type: "object"
        description: >-
            options for describing the format in which the dataset is
            organized; currently only one format ("dnadna", the native
            format for DNADNA) is understood, but others may be added later
        properties:
            format:
                type: "string"
                description: >-
                    a unique label identifying the data format; the format
                    property determines what reader is used for simulation
                    data, and any further options in data_source may depend
                    on the format
                enum: ["dnadna"]
        required: ["format"]
        oneOf:
            - {"$ref": "py-pkgdata:dnadna.schemas/dataset_formats/dnadna.yml"}

    position_format:
        type: "object"
        description: >-
            options related to the format of the positions array; e.g.
            whether it is given as absolute positions in a chromosome or as
            distances between positions, whether it is normalized to
            [0.0, 1.0), etc.
        properties:
            chromosome_size:
                type: "integer"
                description: >-
                    number of base pairs in the chromosome; required only for
                    converting from normalized to un-normalized positions
            initial_position:
                type: ["integer", "number"]
                description: >-
                    initial position to use for circular chromosomes; an
                    initial position is needed, for example, when converting
                    from circular distances to absolute positions
            distance:
                type: "boolean"
                description: >-
                    if true, the positions array represents distances between
                    positions, instead of absolute positions
            normalized:
                type: "boolean"
                description: >-
                    if true, the positions (whether they are absolute or
                    relative) are normalized to the range [0.0, 1.0);
                    in this case it is also necessary to provide the
                    chromosome size if it is needed to convert to the
                    un-normalized values
            circular:
                type: "boolean"
                description: >-
                    whether or not the chromosome is circular
                default: false

    ignore_missing_replicates:
      description: >-
          ignore missing replicates when loading
          data samples; in the case of missing samples the next
          one is tried until one is found
      type: "boolean"
      default: false

    ignore_missing_scenario:
      description: >-
          ignore missing scenarios when loading
          data samples; a scenario is considered missing
          if all its replicates are missing or if the folder
          of the scenario doesn't exist
      type: "boolean"
      default: true

    cache_validation_set:
        description: >-
            used only during training, keeps the validation set cached
            in-memory, which can greatly speed up evaluation; however, if the
            validation set is too large to fit in available memory this can be
            disabled
        type: "boolean"
        default: false

    dnadna_version: {"$ref": "py-pkgdata:dnadna.schemas/definitions.yml#/definitions/version"}

required:
    - data_root
    - dataset_name
    - data_source
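
The generated example below does not include a position_format block; here is a hedged sketch of one, using only the properties defined in the schema above (the values are hypothetical):

position_format:
  normalized: true
  distance: false
  chromosome_size: 2000000
  circular: false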

Example

# config file for a dataset--in this case a dataset is a collection
# of files containing SNP data organized in one of the dataset
# formats understood by DNADNA; this is used both for loading
# simulated data and for loading other datasets on which we want to
# perform prediction

# used only during training, keeps the validation set cached in-
# memory, which can greatly speed up evaluation; however, if the
# validation set is too large to fit in available memory this can be
# disabled
cache_validation_set: false

# root directory for all files related to the dataset, either as an
# absolute path, or as a path relative to the location of this
# config file
data_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

# options for describing the format in which the dataset is
# organized; currently only one format ("dnadna", the native format
# for DNADNA) is understood, but others may be added later
data_source:
  # string template for per-replicate simulation files in Python
  # string template format; the following template variables may be
  # used: 'dataset_name', the same as the dataset_name property used
  # in this config file; 'scenario', the scenario number, and
  # 'replicate', the replicate number of the scenario (if there are
  # multiple replicates); path separators may also be used in the
  # template to form a directory structure
  filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

  # a unique label identifying the data format; the format property
  # determines what reader is used for simulation data, and any
  # further options in data_source may depend on the format
  format: dnadna

  # keys in the NPZ file for the SNP matrix and position array
  # respectively; the "dnadna" format usually prescribes this to be
  # ["SNP", "POS"] but it can be overridden by this property
  keys:
  - SNP
  - POS

# a name to give the dataset; used in generating filenames and
# logging output
dataset_name: generic

# ignore missing replicates when loading data samples; in the case
# of missing samples the next one is tried until one is found
ignore_missing_replicates: false

# ignore missing scenarios when loading data samples; a scenario is
# considered missing if all its replicates are missing or if the
# folder of the scenario doesn't exist
ignore_missing_scenario: true

# path to the CSV file containing the per-scenario parameters used
# in this simulation, either as an absolute path, or as a path
# relative to this config file
scenario_params_path: /builds/ml_genetics/private/dnadna/dnadna/defaults/scenario_params.csv
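
With the data_root and filename_format shown above, scenario 3, replicate 0 of the "generic" dataset would be loaded from a path like the following (a hypothetical expansion; actual scenario and replicate numbers come from the scenario parameters table):

    /builds/ml_genetics/private/dnadna/dnadna/defaults/scenario_3/generic_3_0.npz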

Simulation config schema

The simulation config format is the same as the dataset config, but with an additional simulator_name property as well as simulator-specific properties.

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/simulation.yml"
type: "object"
description: >-
    JSON Schema (YAML-formatted) for basic properties of a simulation on which
    a model will be trained.
allOf:
    - {"$ref": "py-pkgdata:dnadna.schemas/dataset.yml"}
    -
        properties:
            plugins: {"$ref": "py-pkgdata:dnadna.schemas/plugins.yml"}

            simulator_name:
                type: "string"
                description: >-
                    for simulations output by the `dnadna simulation` command, the
                    name of the simulation class used to initialize and run the
                    simulation

            n_scenarios:
                type: "integer"
                description: >-
                    number of different scenarios simulated; each scenario
                    is a parameterization of the simulation with a different
                    set of (possibly random) parameter values; each scenario
                    may have one or more "replicates"--simulations using the
                    same parameters, but with different randomized
                    outputs--the number of replicates of each scenario
                    should be listed in the scenario parameters table
                minimum: 1
                default: 1

            seed:
                type: ["integer", "null"]
                description: >-
                    fixed seed to use for seeding the random number
                    generator at the beginning of the simulation; if absent
                    then the PRNG's default seeding method is used
                default: null

            summary_statistics: {"$ref": "py-pkgdata:dnadna.schemas/summary-statistics.yml#/definitions/summary_statistics"}

        required:
            - n_scenarios
            - scenario_params_path

    - {"$ref": "py-obj:dnadna.schemas.plugins.simulator"}

Example

# JSON Schema (YAML-formatted) for basic properties of a simulation
# on which a model will be trained.

# used only during training, keeps the validation set cached in-
# memory, which can greatly speed up evaluation; however, if the
# validation set is too large to fit in available memory this can be
# disabled
cache_validation_set: false

# root directory for all files related to the dataset, either as an
# absolute path, or as a path relative to the location of this
# config file
data_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

# options for describing the format in which the dataset is
# organized; currently only one format ("dnadna", the native format
# for DNADNA) is understood, but others may be added later
data_source:
  # string template for per-replicate simulation files in Python
  # string template format; the following template variables may be
  # used: 'dataset_name', the same as the dataset_name property used
  # in this config file; 'scenario', the scenario number, and
  # 'replicate', the replicate number of the scenario (if there are
  # multiple replicates); path separators may also be used in the
  # template to form a directory structure
  filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

  # a unique label identifying the data format; the format property
  # determines what reader is used for simulation data, and any
  # further options in data_source may depend on the format
  format: dnadna

  # keys in the NPZ file for the SNP matrix and position array
  # respectively; the "dnadna" format usually prescribes this to be
  # ["SNP", "POS"] but it can be overridden by this property
  keys:
  - SNP
  - POS

# a name to give the dataset; used in generating filenames and
# logging output
dataset_name: one_event
generation_time: 25

# ignore missing replicates when loading data samples; in the case
# of missing samples the next one is tried until one is found
ignore_missing_replicates: false

# ignore missing scenarios when loading data samples; a scenario is
# considered missing if all its replicates are missing or if the
# folder of the scenario doesn't exist
ignore_missing_scenario: true
max: 4.698970004336019
mutation_rate: 1.0e-08
n_max: 4.698970004336019
n_min: 3.6989700043360187
n_replicates: 3
n_samples: 50

# number of different scenarios simulated; each scenario is a
# parameterization of the simulation with a different set of
# (possibly random) parameter values; each scenario may have one or
# more "replicates"--simulations using the same parameters, but with
# different randomized outputs--the number of replicates of each
# scenario should be listed in the scenario parameters table
n_scenarios: 100
recombination_rate: 1.0e-08

# path to the CSV file containing the per-scenario parameters used
# in this simulation, either as an absolute path, or as a path
# relative to this config file
scenario_params_path: /builds/ml_genetics/private/dnadna/docs/one_event_params.csv

# fixed seed to use for seeding the random number generator at the
# beginning of the simulation; if absent then the PRNG's default
# seeding method is used
seed: null
segment_length: 2000000.0

# for simulations output by the `dnadna simulation` command, the
# name of the simulation class used to initialize and run the
# simulation
simulator_name: one_event
tmax: 100000
tmin: 2000

Preprocessing config schema

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/preprocessing.yml"
type: "object"

description: >-
    required configuration for the `dnadna preprocess` command

properties:
    dataset:
        description: the dataset/simulation configuration
        "$ref": "py-pkgdata:dnadna.schemas/dataset.yml"

    model_root:
        type: "string"
        format: "filename!"
        description: >-
            root directory for all training runs of this model / training
            configuration
        default: "."

    model_name:
        type: "string"
        description: >-
            unique name to give to models trained with this configuration;
            individual training runs will prepend this to the run_id
        minLength: 1

    learned_params:
        description: >-
            description of the parameters the network will be trained on
        "$ref": "py-pkgdata:dnadna.schemas/param-set.yml"

    dataset_splits:
        description: >-
            how to split the dataset between training, validation, and test
            sets

            numbers given for each subset are ratios which must sum to at
            most 1; if they sum to less than 1 some portion of the dataset
            will be omitted, and if they sum to more than 1 an error is
            raised

            dataset splits are performed after unusable scenarios are omitted
            according to the pre-processing parameters (min_snp, etc.)
        type: "object"
        properties:
            training:
                description: portion of the dataset to use for training
                type: "number"
                exclusiveMinimum: 0
                exclusiveMaximum: 1
            validation:
                description: portion of the dataset to use for validation
                type: "number"
                exclusiveMinimum: 0
                exclusiveMaximum: 1
            test:
                description: >-
                    portion of the dataset to use for the test set (optional)
                type: "number"
                minimum: 0
                exclusiveMaximum: 1
            unused:
                description: >-
                    portion of the dataset which will not be used (optional,
                    reserved for custom purposes)
                type: "number"
                minimum: 0
                exclusiveMaximum: 1
        required: ["training", "validation"]
        additionalProperties: false

    preprocessing:
        description: >-
            these are parameters used for data pre-processing prior to
            training; they determine the subset of the dataset that will be
            used for a training run
        type: "object"
        properties:
            min_snp:
                description: "minimum number of SNPs each sample should have"
                type: ["integer", "null"]
                minimum: 1
                default: null
            min_indiv:
                description: "minimum number of individuals in each sample"
                type: ["integer", "null"]
                minimum: 1
                default: null
            seed:
                description: >-
                    random seed to initialize PRNG; in particular
                    randomization is used during pre-processing to separate
                    scenarios into the training and validation sets, and
                    specifying a seed ensures the split is consistent
                    between runs
                type: ["integer", "null"]
                default: null
            n_workers:
                description: >-
                    if greater than 0, the number of worker processes to
                    use for preprocessing; using multiple workers can in
                    some cases speed up preprocessing
                type: "integer"
                minimum: 0
                default: 0

    dnadna_version: {"$ref": "py-pkgdata:dnadna.schemas/definitions.yml#/definitions/version"}

    plugins: {"$ref": "py-pkgdata:dnadna.schemas/plugins.yml"}

required:
    - dataset
    - model_root
    - model_name
    - learned_params
    - dataset_splits
    - preprocessing
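
The generated example below only uses training and validation splits; here is a hedged sketch of a dataset_splits block that also reserves test and unused portions (the ratios are hypothetical; they sum to exactly 1, so no part of the dataset is silently dropped):

dataset_splits:
  training: 0.6
  validation: 0.2
  test: 0.1
  unused: 0.1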

Example

# required configuration for the `dnadna preprocess` command

# the dataset/simulation configuration
dataset:
  # used only during training, keeps the validation set cached in-
  # memory, which can greatly speed up evaluation; however, if the
  # validation set is too large to fit in available memory this can
  # be disabled
  cache_validation_set: false

  # root directory for all files related to the dataset, either as
  # an absolute path, or as a path relative to the location of this
  # config file
  data_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

  # options for describing the format in which the dataset is
  # organized; currently only one format ("dnadna", the native
  # format for DNADNA) is understood, but others may be added later
  data_source:
    # string template for per-replicate simulation files in Python
    # string template format; the following template variables may
    # be used: 'dataset_name', the same as the dataset_name property
    # used in this config file; 'scenario', the scenario number, and
    # 'replicate', the replicate number of the scenario (if there
    # are multiple replicates); path separators may also be used in
    # the template to form a directory structure
    filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

    # a unique label identifying the data format; the format
    # property determines what reader is used for simulation data,
    # and any further options in data_source may depend on the
    # format
    format: dnadna

    # keys in the NPZ file for the SNP matrix and position array
    # respectively; the "dnadna" format usually prescribes this to
    # be ["SNP", "POS"] but it can be overridden by this property
    keys:
    - SNP
    - POS

  # a name to give the dataset; used in generating filenames and
  # logging output
  dataset_name: generic

  # ignore missing replicates when loading data samples; in the case
  # of missing samples the next one is tried until one is found
  ignore_missing_replicates: false

  # ignore missing scenarios when loading data samples; a scenario
  # is considered missing if all its replicates are missing or if
  # the folder of the scenario doesn't exist
  ignore_missing_scenario: true

  # path to the CSV file containing the per-scenario parameters used
  # in this simulation, either as an absolute path, or as a path
  # relative to this config file
  scenario_params_path: /builds/ml_genetics/private/dnadna/dnadna/defaults/scenario_params.csv

# how to split the dataset between training, validation, and test
# sets; numbers given for each subset are ratios which must sum to
# at most 1; if they sum to less than 1 some portion of the dataset
# will be omitted, and if they sum to more than 1 an error is
# raised; dataset splits are performed after unusable scenarios are
# omitted according to the pre-processing parameters (min_snp, etc.)
dataset_splits:
  # portion of the dataset to use for training
  training: 0.7

  # portion of the dataset to use for validation
  validation: 0.3

# description of the parameters the network will be trained on
learned_params:
  param1:
    log_transform: false
    loss_func: MSE
    loss_weight: 1
    tied_to_position: false
    type: regression
  param2:
    classes: 2
    loss_func: Cross Entropy
    loss_weight: 1
    type: classification

# unique name to give to models trained with this configuration;
# individual training runs will prepend this to the run_id
model_name: default

# root directory for all training runs of this model / training
# configuration
model_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

# these are parameters used for data pre-processing prior to
# training; they determine the subset of the dataset that will be
# used for a training run
preprocessing:
  # minimum number of individuals in each sample
  min_indiv: 1

  # minimum number of SNPs each sample should have
  min_snp: 1

  # if greater than 0, the number of worker processes to use for
  # preprocessing; using multiple workers can in some cases speed up
  # preprocessing
  n_workers: 2

  # random seed to initialize PRNG; in particular randomization is
  # used during pre-processing to separate scenarios into the
  # training and validation sets, and specifying a seed ensures the
  # split is consistent between runs
  seed: null

Learned params schema

# JSON Schema (YAML-formatted) for details about parameters to learn in a
# training run
$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/param-set.yml"
description: >-
    details of parameters to learn in training; it may be a mapping of param
    names to param configurations, or a list thereof (this is the case when
    using YAML ordered mappings, which are translated into lists of
    single-element mappings); in the latter case the specified order of the
    parameters is preserved when mapping parameters to optimization targets
oneOf:
    - {"$ref": "py-pkgdata:dnadna.schemas/param-set.yml#/definitions/parameters"}
    - type: "array"
      items: {"$ref": "py-pkgdata:dnadna.schemas/param-set.yml#/definitions/parameters"}
      minItems: 1
      errorMsg:
          minItems: at least one parameter must be declared in {property}

definitions:
    loss_func:
        description: >-
            name of the loss function to apply to this parameter; the name
            is the same as the class or function implementing the loss
            function (e.g. MSELoss) minus the "Loss" in the name and is
            case-insensitvie (e.g. "mse" for MSELoss); spaces are also
            allowed in the function name for clarity (e.g. "cross entropy")
            and are simply ignored when looking up the associated
            class/function; the default value depends on the parameter
            type; spaces
        type: "string"
        minLength: 1

    loss_weight:
        description: >-
            additional weight by which to multiply the parameter's loss
            after applying the loss function, allowing some parameters to
            be weighted more heavily than others; by default all parameters
            are weighted equally
        type: "number"
        minimum: 0
        maximum: 1
        default: 1

    parameters:
        description: a mapping of parameter names to their details
        type: "object"
        minProperties: 1
        errorMsg:
            minProperties: at least one parameter must be declared in {property}
        # We use additionalProperties here because the property names are the
        # parameter names, which are arbitrary strings; hence every key/value
        # pair in this object is assumed to be a parameter definition
        additionalProperties: {"$ref": "py-pkgdata:dnadna.schemas/param-set.yml#/definitions/parameter"}

    parameter:
        description: details about a single parameter
        type: "object"
        errorMsg:
            type: >-
                must be an object like:

                    param_name:
                        type: regression

                or:

                    param_name:
                        type: classification
                        classes:
                            - class1
                            - class2
                            - class3

                where classes can be a list of class names or just the number
                of classes
        required: ["type"]
        properties:
            type:
                description: >-
                    parameter type; either "regression" or "classification".
                    Classification parameters require the additional "classes"
                    property
                enum: ["regression", "classification"]

        # Select between either "regression" which has no other properties
        # and classification which requires the additional "classes"
        # properties.  This could possibly be expressed more succinctly with
        # JSONSchema Draft-07 conditionals, but this is roughly equivalent.
        #
        # TODO: This also implements different defaults for loss_func depending
        # on the parameter type; however I don't think it works yet to
        # automatically supply this default during validation; so that's a
        # special case that might have to be checked...
        oneOf:
            -
                properties:
                    type: {"const": "regression"}
                    loss_func:
                        "$ref": "#/definitions/loss_func"
                        default: "MSE"
                    loss_weight:
                        "$ref": "#/definitions/loss_weight"
                        default: 1
                    log_transform:
                        description: >-
                            whether or not a log transform should be applied to
                            this parameter's known values during
                            pre-processing; training is then performed with the
                            log values (regression parameters only)
                        type: "boolean"
                        default: false
                    tied_to_position:
                        description: >-
                            values of this parameter are SNP positions, so any
                            transformations or normalizations of the position
                            array must also be applied to this parameter during
                            training
                        type: "boolean"
                        default: false
                additionalProperties: false
            -
                properties:
                    type: {"const": "classification"}
                    loss_func:
                        "$ref": "py-pkgdata:dnadna.schemas/param-set.yml#/definitions/loss_func"
                        default: "Cross Entropy"
                    loss_weight:
                        "$ref": "py-pkgdata:dnadna.schemas/param-set.yml#/definitions/loss_weight"
                        default: 1
                    classes:
                        description: >-
                            the classes of a classification parameter, either
                            an integer giving the number of classes in the
                            parameter, or an array giving explicit names to
                            the classes (one item for each class); class names
                            can themselves be either strings or integers
                            (which are converted automatically to strings, as
                            they are just labels for the classes)
                        type: ["integer", "array"]
                        items:
                            type: ["integer", "string"]
                        minimum: 1
                        minItems: 1
                    n_classes:
                        description: >-
                            after pre-processing, this property contains the number
                            of classes in a classification parameter; if the
                            "classes" property is an integer this is identical;
                            otherwise it is the length of the "classes" array;
                            normally this property should not be manually specified
                        type: "integer"
                        minimum: 1
                required: ["classes"]
                additionalProperties: false
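
This schema has no generated example of its own, so here is a hedged sketch of a learned_params block showing both parameter types, including a classification parameter with named classes (all parameter and class names are hypothetical):

learned_params:
  event_time:              # hypothetical regression parameter
    type: regression
    log_transform: true
  scenario_class:          # hypothetical classification parameter
    type: classification
    classes:
      - constant
      - expansion
      - decline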

Training config schema

# JSON Schema (YAML-formatted) for inference/training parameters file.
$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/training.yml"
type: "object"
description: >-
    the main training configuration, typically generated from an existing
    preprocessing config file
allOf:
    - properties:
          network:
              description:
                  "name and parameters of the neural net model to train"
              properties:
                  name:
                      description: "name of the network to train"
                  params:
                      description: >-
                          options specific to the neural net model being
                          trained; these are passed as keyword arguments to the
                          net's constructor (see dnadna.net module); the schema
                          for this property depends on which model is being
                          used--model-specific schemas are found in
                          dnadna/schemas/nets, though a model may also provide
                          its schema as a .schema attribute
              default:
                  name: "SPIDNA"
              "$ref": "py-obj:dnadna.schemas.plugins.network"

          optimizer:
              description: >-
                  name and parameters of the optimizer to use; all built-in
                  optimizers from the torch.optim package are available for use
                  here, and you can also provide a custom optimizer via a
                  plugin
              default:
                  name: "Adam"
                  params:
                      learning_rate: 0.001
                      weight_decay: 0
              "$ref": "py-obj:dnadna.schemas.plugins.optimizer"

          lr_scheduler:
              description: >-
                    name and parameters of the learning rate scheduler to use;
                    all built-in schedulers from the torch.optim.lr_scheduler
                    package are available for use here, and you can also provide
                    a custom scheduler via a plugin
              default: null
              anyOf:
                  - {"type": "null"}
                  - {"$ref": "py-obj:dnadna.schemas.plugins.lr_scheduler"}

          dataset_transforms:
              "$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transforms"
              default: []

          n_epochs:
              description: >-
                  number of epochs over which to repeat the training process
              type: "integer"
              minimum: 1
              default: 1

          collate:
              description: >-
                options for the collate function applied to each batch;
                several parameters are possible. snp_dim is the number of
                SNPs per batch: if set to "max", each element of the batch is
                padded up to the maximum SNP dimension in the batch; if set
                to an integer, all elements are padded or cropped to this
                dimension. indiv_dim works the same way for the number of
                individuals. value_fill is the value to pad with, -1 by
                default. It is also possible to choose from which side the
                padding is done; the default is pad_right and pad_bottom. If
                two opposite sides are both set to true (e.g. pad_right and
                pad_left), the padding is split equally between both sides;
                if the sizes do not divide evenly, the extra padding goes to
                the right and bottom sides.
              default:
                snp_dim: max
                indiv_dim: max
                value_fill: -1
                pad_right: True
                pad_left: False
                pad_bottom: True
                pad_top: False
              "$ref": "py-obj:dnadna.schemas.plugins.collate"

          loader_num_workers:
              description: "number of subprocesses to use for data loading"
              type: "integer"
              minimum: 0
              default: 0

          evaluation_interval:
              description: >-
                interval (number of batches processed) between two validation
                steps; for m evaluations per epoch, set to
                n_training_samples // (batch_size * m), where the number of
                training samples can be found in the training logs; this
                parameter is mutually exclusive with n_evaluation_per_epoch
              type: ["integer", "null"]
              minimum: 1
              default: null

          n_evaluation_per_epoch:
              description: >-
                number of times to evaluate the model on the validation set
                per epoch; this parameter is mutually exclusive with
                evaluation_interval
              type: ["integer", "null"]
              minimum: 1
              default: null

          device:
              description: >-
                  device string; options are 'None', 'cpu', 'cuda', or GPU
                  indices such as '0' or '0,1,2,3'; defaults to an empty
                  string, which auto-selects the first available GPU, or the
                  CPU if no GPU is available
              oneOf:
                  - type: "string"
              default: ''

          seed:
              description: >-
                  seed for initializing the PRNG prior to a training run for
                  reproducible results; if unspecified the PRNG chooses its
                  default seeding method
              type: ["integer", "null"]
              default: null

          model_filename_format:
              type: "string"
              description: >-
                  format string for the filename of the final output model; it
                  can use the template variables model_name, run_name, and/or
                  run_id, while the required variable "checkpoint" will be
                  replaced with names like "best", "last" and other
                  intermediate checkpoints
              minLength: 1
              default: "{model_name}_{run_name}_{checkpoint}_net.pth"

          run_name_format:
              description: >-
                  format string for the name given to this run for a sequence
                  of runs of the same model; the outputs of each run are placed
                  in subdirectories of <run_path>/<model_name> with the name of
                  this run; the format string can use the template variables
                  model_name, run_id, batch_size, learning_rate, network_name,
                  and n_epochs; it can also use the parameters in
                  dataset_transforms, using the format
                  {transformtype_transformparameter}, for example
                  {crop_max_snp}; for detailed examples, please refer to
                  https://mlgenetics.gitlab.io/dnadna/training.html#computing-resources-i-o-information-misc
              type: "string"
              minLength: 4
              default: "run_{run_id}"

          train_mean:
              "$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/param_stats"
              description: >-
                  mean of each regression parameter over the training set

          train_std:
              "$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/param_stats"
              description: >-
                  standard deviation of each regression parameter over the
                  training set

    # Inherits the preprocessing config format
    - {"$ref": "py-pkgdata:dnadna.schemas/preprocessing.yml"}



additionalProperties: true

definitions:
    transform_list:
        type: "array"
        items: {"$ref": "py-obj:dnadna.schemas.plugins.transform"}
        default: []
    transforms:
        description: >-
            list of transforms to apply to the dataset; all optional transforms
            are disabled by default unless specified here; transforms which
            don't take any parameters can be listed just by their name, whereas
            transforms which do take parameters are given as {'name': <name>,
            'param1':, 'param2':, ...}, where the params map param names
            (specific to the transform) to their values
        oneOf:
            - "$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transform_list"
            - type: "object"
              properties:
                  training: {"$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transform_list"}
                  validation: {"$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transform_list"}
                  test: {"$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transform_list"}
              patternProperties: {"^[a-zA-Z0-9_]+$": {"$ref": "py-pkgdata:dnadna.schemas/training.yml#/definitions/transform_list"}}
              additionalProperties: false
    param_stats:
        type: "object"
        description: >-
            map of learned param names to some numerical statistic (e.g. mean,
            standard deviation, etc.) about the values of that parameter in the
            preprocessed scenario params table
        additionalProperties:
            type: "number"

Example

# the main training configuration, typically generated from an
# existing preprocessing config file

batch_size: 8

# options for the collate function applied to each batch; several
# parameters are possible. snp_dim is the number of SNPs per batch:
# if set to "max", each element of the batch is padded up to the
# maximum SNP dimension in the batch; if set to an integer, all
# elements are padded or cropped to this dimension. indiv_dim works
# the same way for the number of individuals. value_fill is the
# value to pad with, -1 by default. It is also possible to choose
# from which side the padding is done; the default is pad_right and
# pad_bottom. If two opposite sides are both set to true (e.g.
# pad_right and pad_left), the padding is split equally between both
# sides; if the sizes do not divide evenly, the extra padding goes
# to the right and bottom sides.
collate:
  indiv_dim: max
  pad_bottom: true
  pad_left: false
  pad_right: true
  pad_top: false
  snp_dim: max
  value_fill: -1

# the dataset/simulation configuration
dataset:
  # used only during training, keeps the validation set cached in-
  # memory, which can greatly speed up evaluation; however, if the
  # validation set is too large to fit in available memory this can
  # be disabled
  cache_validation_set: false

  # root directory for all files related to the dataset, either as
  # an absolute path, or as a path relative to the location of this
  # config file
  data_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

  # options for describing the format in which the dataset is
  # organized; currently only one format ("dnadna", the native
  # format for DNADNA) is understood, but others may be added later
  data_source:
    # string template for per-replicate simulation files in Python
    # string template format; the following template variables may
    # be used: 'dataset_name', the same as the dataset_name property
    # used in this config file; 'scenario', the scenario number, and
    # 'replicate', the replicate number of the scenario (if there
    # are multiple replicates); path separators may also be used in
    # the template to form a directory structure
    filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

    # a unique label identifying the data format; the format
    # property determines what reader is used for simulation data,
    # and any further options in data_source may depend on the
    # format
    format: dnadna

    # keys in the NPZ file for the SNP matrix and position array
    # respectively; the "dnadna" format usually prescribes this to
    # be ["SNP", "POS"] but it can be overridden by this property
    keys:
    - SNP
    - POS

  # a name to give the dataset; used in generating filenames and
  # logging output
  dataset_name: generic

  # ignore missing replicates when loading data samples; in the case
  # of missing samples the next one is tried until one is found
  ignore_missing_replicates: false

  # ignore missing scenarios when loading data samples; a scenario
  # is considered missing if all its replicates are missing or if
  # the folder of the scenario doesn't exist
  ignore_missing_scenario: true

  # path to the CSV file containing the per-scenario parameters used
  # in this simulation, either as an absolute path, or as a path
  # relative to this config file
  scenario_params_path: /builds/ml_genetics/private/dnadna/dnadna/defaults/scenario_params.csv

# how to split the dataset between training, validation, and test
# sets; numbers given for each subset are ratios which must sum to
# at most 1; if they sum to less than 1 some portion of the dataset
# will be omitted, and if they sum to more than 1 an error is
# raised; dataset splits are performed after unusable scenarios are
# omitted according to the pre-processing parameters (min_snp, etc.)
dataset_splits:
  # portion of the dataset to use for training
  training: 0.7

  # portion of the dataset to use for validation
  validation: 0.3

# list of transforms to apply to the dataset; all optional
# transforms are disabled by default unless specified here;
# transforms which don't take any parameters can be listed just by
# their name, whereas transforms which do take parameters are given
# as {'name': <name>, 'param1':, 'param2':, ...}, where the params
# map param names (specific to the transform) to their values
dataset_transforms:
- crop:
    keep_polymorphic_only: true
    max_indiv: null
    max_snp: 400
- snp_format: concat
- validate_snp:
    uniform_shape: false

# device string; options are 'None', 'cpu', 'cuda', or GPU indices
# such as '0' or '0,1,2,3'; defaults to an empty string, which
# auto-selects the first available GPU, or the CPU if no GPU is
# available
device: '0'

# interval (number of batches processed) between two validation
# steps; for m evaluations per epoch, set to n_training_samples //
# (batch_size * m), where the number of training samples can be
# found in the training logs; this parameter is mutually exclusive
# with n_evaluation_per_epoch
evaluation_interval: 1

# description of the parameters the network will be trained on
learned_params:
  param1:
    log_transform: false
    loss_func: MSE
    loss_weight: 1
    tied_to_position: false
    type: regression
  param2:
    classes: 2
    loss_func: Cross Entropy
    loss_weight: 1
    type: classification

# number of subprocesses to use for data loading
loader_num_workers: 1

# name and parameters of the learning rate scheduler to use; all
# built-in schedulers from the torch.optim.lr_scheduler package are
# available for use here, and you can also provide a custom
# scheduler via a plugin
lr_scheduler: null

# format string for the filename of the final output model; it can
# use the template variables model_name, run_name, and/or run_id,
# while the required variable "checkpoint" will be replaced with
# names like "best", "last" and other intermediate checkpoints
model_filename_format: '{model_name}_{run_name}_{checkpoint}_net.pth'

# unique name to give to models trained with this configuration;
# individual training runs will prepend this to the run_id
model_name: default

# root directory for all training runs of this model / training
# configuration
model_root: /builds/ml_genetics/private/dnadna/dnadna/defaults

# number of epochs over which to repeat the training process
n_epochs: 1

# number of times to evaluate the model on the validation set per
# epoch; this parameter is mutually exclusive with
# evaluation_interval
n_evaluation_per_epoch: null

# name and parameters of the neural net model to train
network:
  name: CustomCNN

  # net parameters for CNN
  params: {}

# name and parameters of the optimizer to use; all built-in
# optimizers from the torch.optim package are available for use
# here, and you can also provide a custom optimizer via a plugin
optimizer:
  name: Adam
  params:
    amsgrad: false
    betas:
    - 0.9
    - 0.999
    capturable: false
    differentiable: false
    eps: 1.0e-08
    foreach: null
    fused: null
    learning_rate: 0.001
    maximize: false
    weight_decay: 0

# these are parameters used for data pre-processing prior to
# training; they determine the subset of the dataset that will be
# used for a training run
preprocessing:
  # minimum number of individuals in each sample
  min_indiv: 1

  # minimum number of SNPs each sample should have
  min_snp: 1

  # if greater than 0, the number of worker processes to use for
  # preprocessing; using multiple workers can in some cases speed up
  # preprocessing
  n_workers: 2

  # random seed to initialize PRNG; in particular randomization is
  # used during pre-processing to separate scenarios into the
  # training and validation sets, and specifying a seed ensures the
  # split is consistent between runs
  seed: null

# format string for the name given to this run for a sequence of
# runs of the same model; the outputs of each run are placed in
# subdirectories of <run_path>/<model_name> with the name of this
# run; the format string can use the template variables model_name,
# run_id, batch_size, learning_rate, network_name, and n_epochs;
# they can also use the parameters in dataset_transforms, using the
# format {transformtype_transformparameter}, for example
# {crop_max_snp}; for detailed examples, please refer to
# https://mlgenetics.gitlab.io/dnadna/training.html#computing-resources-i-o-information-misc
run_name_format: run_{run_id}

# seed for initializing the PRNG prior to a training run for
# reproducible results; if unspecified the PRNG chooses its default
# seeding method
seed: null
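
The collate block in the example above pads every batch to its own maximum; for fixed-size batches, snp_dim and indiv_dim can instead be integers, as in this hedged sketch (the sizes shown are hypothetical):

collate:
  snp_dim: 500     # hypothetical fixed SNP dimension
  indiv_dim: 50    # hypothetical fixed number of individuals
  value_fill: -1
  pad_right: true
  pad_left: false
  pad_bottom: true
  pad_top: false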

Summary statistics config schema

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/summary-statistics.yml"
type: "object"
description: >-
    summary statistics configuration: since computing summary statistics
    requires reading the simulation data, this config requires a reference
    to a simulation config, either embedded or inherited from an external
    file; alternatively, a simulation config file may also contain an
    embedded summary statistics config in its `summary_statistics`
    property; for example,

    either a summary statistics config with an embedded/inherited simulation
    config::

        chromosome_size: 2e6
        # .. additional summary statistics properties ...
        simulation:
            # ... simulation config properties, or inherit: ...

    or you can use a simulation config with an embedded summary statistics
    config::

        data_root: "."
        name: "my_simulation"
        # ... additional simulation properties ...
        summary_statistics:
            # ... summary statistics config properties, or inherit: ...

definitions:
    summary_statistics:
        type: "object"
        description: >-
            settings for calculating and outputting summary statistics on this
            simulation
        properties:
            plugins: {"$ref": "py-pkgdata:dnadna.schemas/plugins.yml"}

            filename_format:
                type: "string"
                description: >-
                    string template for per-scenario summary statistics files;
                    for each scenario three statistics tables are output: the
                    LD (Linkage Disequilibrium) scores, SFS (Site Frequency
                    Spectrum), and the "sel" file containing additional test
                    statistics such as Tajima's D, iHS, nSL, and possibly
                    others to be implemented; this template may contain up to
                    3 variables: 'dataset_name' (the name of the dataset),
                    'scenario' (the integer scenario index), and 'type' ('ld',
                    'sfs', or 'sel')
                default: "sumstats/scenario_{scenario}/{dataset_name}_{scenario}_{type}.csv"

            chromosome_size:
                type: "number"
                description: >-
                    number of base pairs in the chromosome
                # TODO: This seems arbitrary; why this number?  Should there
                # even be a default at all?
                default: 2.0e+6

            ld_options:
                type: "object"
                description: >-
                    options to pass to the LD computation
                default: {}
                properties:
                    circular:
                        type: "boolean"
                        description: >-
                            whether or not circular chromosomes are being
                            considered
                        default: false
                    distance_bins:
                        type: ["array", "integer"]
                        description: >-
                            distance bins into which to group SNPs; LD is then
                            averaged over those bins; either an array of
                            distance groups, or an integer giving the number of
                            bins to create over log space in max distance
                        default: 19  # TODO: Why 19?
                        minItems: 1
                        minimum: 1

            sfs_options:
                type: "object"
                description: >-
                    options to pass to the SFS computation
                default: {}
                properties:
                    folded:
                        type: "boolean"
                        description: >-
                            whether or not to compute the folded SFS
                        default: false

            sel_options:
                type: "object"
                description: >-
                    options to pass to the additional sel statistics
                default: {}
                properties:
                    window:
                        type: ["integer", "null"]
                        description: >-
                            number of bins into which to slice SNP positions;
                            the statistic is then computed over each window
                            instead of over all sites together; if the value is
                            0, the statistics are not binned
                        default: 100  # TODO: Why 100??
                        minimum: 0

oneOf:
    - allOf:
        - {"$ref": "#/definitions/summary_statistics"}
        -
            properties:
                simulation: {"$ref": "py-pkgdata:dnadna.schemas/simulation.yml"}
            required: ["simulation"]
    - allOf:
        - {"$ref": "py-pkgdata:dnadna.schemas/simulation.yml"}
        - {"required": ["summary_statistics"]}